Add covariates to stomach data

Author

Max Lindmark & Viktor Thunell

Published

September 5, 2024

Libraries

pkgs <- c("tidyverse", "tidylog", "sp", "raster", "devtools", "RCurl", "sdmTMB", "purrr", "terra", "ncdf4", "chron", "ncdf4", "tidyterra") 

if(length(setdiff(pkgs, rownames(installed.packages()))) > 0){

    install.packages(setdiff(pkgs, rownames(installed.packages())), dependencies = T)
  
}

invisible(lapply(pkgs, library, character.only = T))

# Source code for map plots
devtools::source_url("https://raw.githubusercontent.com/VThunell/Lammska_cod-fr/main/R/functions/map-plot.R")

# Set path
home <- here::here()

Read data

dat <- read_csv(paste0(home, "/data/clean/stomachs3.csv"))

glimpse(dat)
Rows: 136,466
Columns: 28
$ pred_ID            <chr> "LV_1968_9_19_1_37G4_52", "LV_1973_5_11_15_37G5_100…
$ herring            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ saduria            <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.54, 0.0…
$ sprat              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ other              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ other_invert       <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.0…
$ other_fish         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ fr_tot             <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.0…
$ fr_sad             <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000000000,…
$ fr_spr             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ fr_her             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ fr_other           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ fr_other_invert    <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000000000,…
$ fr_other_fish      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ year               <dbl> 1968, 1973, 1973, 1973, 1973, 1983, 1983, 1966, 196…
$ month              <dbl> 9, 5, 5, 5, 5, 12, 12, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6…
$ day                <dbl> 19, 11, 11, 11, 11, 16, 16, 22, 22, 22, 22, 19, 19,…
$ day_of_year        <dbl> 263, 131, 131, 131, 131, 350, 350, 22, 22, 22, 22, …
$ pred_weight        <dbl> 156.25, 49.13, 49.13, 58.32, 40.96, 2160.00, 4565.3…
$ pred_length        <dbl> 25, 17, 17, 18, 16, 60, 77, 18, 25, 19, 19, 31, 33,…
$ pred_weight_source <chr> "estimated", "estimated", "estimated", "estimated",…
$ lat                <dbl> 54.25, 54.25, 54.25, 54.25, 54.25, 54.25, 54.25, 54…
$ lon                <dbl> 14.5, 15.5, 15.5, 15.5, 15.5, 15.5, 15.5, 19.5, 19.…
$ ices_rect          <chr> "37G4", "37G5", "37G5", "37G5", "37G5", "37G5", "37…
$ X                  <dbl> 467.4220, 532.5780, 532.5780, 532.5780, 532.5780, 5…
$ Y                  <dbl> 6011.453, 6011.453, 6011.453, 6011.453, 6011.453, 6…
$ data_source        <chr> "old_db", "old_db", "old_db", "old_db", "old_db", "…
$ Country            <chr> "LV", "LV", "LV", "LV", "LV", "LV", "LV", "LV", "LV…

Add depth

## Add depth

dep_raster <- terra::rast(paste0(home, "/data/Mean depth natural colour (with land).nc"))
class(dep_raster)
[1] "SpatRaster"
attr(,"package")
[1] "terra"
crs(dep_raster, proj = TRUE)
[1] "+proj=longlat +datum=WGS84 +no_defs"
plot(dep_raster)

dat$depth <- terra::extract(dep_raster, dat |> dplyr::select(lon, lat))$elevation

ggplot(dat, aes(lon, lat, color = depth*-1)) + 
  geom_point()

dat$depth <- dat$depth*-1

# TODO: these coordinates are waaay off
# This removes 8349 rows
dat <- dat |> drop_na(depth) 

dat |> 
  ggplot(aes(X*1000, Y*1000, color = depth)) + 
  geom_point() +
  NULL

hist(dat$depth)

#remove points outside the baltic, i.e. the western most points in:
dat |>
  ggplot(aes(lon, lat, color = depth)) + 
  geom_point() + 
  geom_vline(xintercept = 13) +
  coord_sf()

dat <- dat |>
  filter( lon > 13)

plot_map_fc +
  geom_point(data = dat, aes(X*1000, Y*1000, color = depth), alpha = 0.5) +
  theme_sleek(base_size = 6) + 
  geom_sf()
Warning: Removed 3480 rows containing missing values or values outside the scale range
(`geom_point()`).

Add oxy, temp, and sal from SMHI hindcast (and models of Copernicus and hindcast)

# read Hindcast data
hindenv_df <- readRDS(file = paste0(home, "/data/clean/hindcast_1961_2017.rds")) |>
  filter(year > 1962) |>
  mutate(yearmonth = (year-1963)*12+month)

dat <- dat |> mutate(yearmonth = (year-1963)*12+month)

# for the following values we need to model the hindcast
lateobs_my <- dat |> filter(year > 2017) |> distinct(yearmonth) 

# make a pred grid for those late observations
lateobs_predhind <- hindenv_df |>
  distinct(lat, lon) |>
  replicate_df(time_name = "yearmonth", time_values = pull(lateobs_my)) |>
  mutate( model = as_factor("hindcast"),
          year = floor(1963+(yearmonth/12)),
          month = yearmonth %% 12) |> # mod, i.e. the remainder of an integer divide (here month)
  add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633)

# Predict using the models for oxy, sal and temp. For reference see 1c_combine_oxysaltemp_data.qmd

# load model for oxygen
Mod_oxy <- readRDS(paste0(home, "/R/prepare-data/Mod_oxy.rds"))
# load model for salinity
Mod_sal <- readRDS(paste0(home, "/R/prepare-data/Mod_sal.rds"))
# load model for temperature
Mod_temp <- readRDS(paste0(home, "/R/prepare-data/Mod_temp.rds"))

oxypreds <- predict(Mod_oxy, newdata = lateobs_predhind) |> mutate(oxy = est) |> dplyr::select(lat, lon, year, month, yearmonth, oxy)
salpreds <- predict(Mod_sal, newdata = lateobs_predhind) |> mutate(sal = est) |> dplyr::select(lat, lon, year, month, yearmonth, sal)
temppreds <- predict(Mod_temp, newdata = lateobs_predhind) |> mutate(temp = est) |> dplyr::select(lat, lon, year, month, yearmonth, temp)

# Combine hindccast preds for 2018-2023 with hindcast (1963-2017)
hindcast_allyears <- left_join(salpreds, oxypreds) |> left_join(temppreds) |> 
  bind_rows(hindenv_df |> dplyr::select(temp, sal, oxy, lat, lon, year, month, yearmonth))
Joining with `by = join_by(lat, lon, year, month, yearmonth)`
Joining with `by = join_by(lat, lon, year, month, yearmonth)`
map(hindcast_allyears, ~sum(is.na(.))) # No NAs
$lat
[1] 0

$lon
[1] 0

$year
[1] 0

$month
[1] 0

$yearmonth
[1] 0

$sal
[1] 0

$oxy
[1] 0

$temp
[1] 0
hindcast_allyears |> # for >2018 we only have the months that we need for the observations in dat
  summarise(nobs = n(), .by = c(month, year)) |>
  arrange(year)
# A tibble: 675 × 3
   month  year  nobs
   <dbl> <dbl> <int>
 1     1  1963 18300
 2     2  1963 18300
 3     3  1963 18300
 4     4  1963 18300
 5     5  1963 18300
 6     6  1963 18300
 7     7  1963 18300
 8     8  1963 18300
 9     9  1963 18300
10    10  1963 18300
# ℹ 665 more rows
hindcast_allyears |>
  summarise(mean_oxy = mean(oxy), .by = c(year, month)) |>
  filter(month %in% c(2,3,11)) |>
  ggplot() +
  geom_line(aes(year, mean_oxy, color = factor(month)), linetype = "dashed") +
  geom_line(data = hindenv_df |> filter(month %in% c(2,3,11)) |> summarise(oxy = mean(oxy), .by = c(year, month)), aes(x = year, y = oxy, color = factor(month)))

hindcast_allyears |>
  summarise(mean_temp = mean(temp), .by = c(year, month)) |>
  filter(month %in% c(2,3,11)) |>
  ggplot() +
  geom_line(aes(year, mean_temp, color = factor(month)), linetype = "dashed") +
  geom_line(data = hindenv_df |> filter(month %in% c(2,3,11)) |> summarise(temp = mean(temp), .by = c(year, month)), aes(x = year, y = temp, color = factor(month)))

# Now use terra::extract to get oxy, sal and temp values to dat

# function for extraction based on yearmonth
ext_envdat <- function(ayearmonth)  {

  ext_y = dat |> filter(yearmonth == ayearmonth)  |> dplyr::select(lon, lat) # coords from the stomach data, can only be lon and lat for extract()
  
  hindcast_allyears |>
    filter(yearmonth == ayearmonth) |>
    as_spatraster(xycols = c(2,1), crs = "WGS84", digits = 2) |>
    terra::extract(ext_y, method = "bilinear", ID=FALSE) |> # to reduce NaNs produced when points are outside raster extent (i.e. on land) 
    dplyr::select(oxy, sal, temp) |>
    bind_cols(dat |> filter(yearmonth == ayearmonth))
    
}

dat <- dat |>
  mutate(yearmonth = (year-1963)*12+month)

# use the ext_envdat function to extract env covs values to the observations fr hindcast
dat_env <- unique(dat |> pull(yearmonth)) |> 
  map(\(x) ext_envdat(x)) |>
  list_rbind()

# Many observations lie just outside the hincast_allyears raster extent causing NA values
theNAs <- dat_env |> filter(is.na(oxy) | is.na(sal) | is.na(temp))
theNAs |> summarise(n = n(), .by= year) |> arrange(year)
  year   n
1 1967   3
2 1974 158
3 1976  20
4 1977  62
5 1978  48
6 1979 266
7 1980 131
8 1981   8
9 2013  79
# filter out those observations from the data using pred_ID
dat_na <- dat |> 
  filter(pred_ID %in% theNAs$pred_ID)

# a function to find the closest point with env covs in the env_dat
cov_na_fill <- function(napred_id)  {

  pid_lly = dat_na |> 
    filter(pred_ID == napred_id) |> dplyr::select(lon, lat, yearmonth) 

  pl = distance(pid_lly |> dplyr::select(lon, lat),
           hindcast_allyears |> filter(yearmonth == pid_lly$yearmonth) |> dplyr::select(lon,lat),
           lonlat = TRUE)
  
  dat_na |> 
    filter(pred_ID == napred_id) |> 
    bind_cols(hindcast_allyears[which.min(pl),] |> dplyr::select(oxy,temp,sal) )

}

# apply function to dat_na
dat_na_filled <- dat_na$pred_ID |>
map(\(x) cov_na_fill(x)) |>
  list_rbind() 
# no NAs left
dat_na_filled  |> filter(is.na(oxy) | is.na(sal) | is.na(temp))
# A tibble: 0 × 33
# ℹ 33 variables: pred_ID <chr>, herring <dbl>, saduria <dbl>, sprat <dbl>,
#   other <dbl>, other_invert <dbl>, other_fish <dbl>, fr_tot <dbl>,
#   fr_sad <dbl>, fr_spr <dbl>, fr_her <dbl>, fr_other <dbl>,
#   fr_other_invert <dbl>, fr_other_fish <dbl>, year <dbl>, month <dbl>,
#   day <dbl>, day_of_year <dbl>, pred_weight <dbl>, pred_length <dbl>,
#   pred_weight_source <chr>, lat <dbl>, lon <dbl>, ices_rect <chr>, X <dbl>,
#   Y <dbl>, data_source <chr>, Country <chr>, depth <dbl>, yearmonth <dbl>, …
# bind the non-NA observations with the NAs 
dat_env_all <- dat_env |> 
  filter(!is.na(oxy) | !is.na(sal) | !is.na(temp)) |>
  bind_rows(dat_na_filled)
# all are there?(!)
dat_env_all |> summarise(n()) == dat |> summarise(n())
      n()
[1,] TRUE
sum(is.na(dat_env_all$oxy))
[1] 0
# check plots
plot_map_fc +
  geom_point(data = dat_env_all, aes(X*1000, Y*1000, color = oxy), alpha = 0.5) +
  theme_sleek(base_size = 6) + 
  geom_sf() 
Warning: Removed 3480 rows containing missing values or values outside the scale range
(`geom_point()`).

plot_map_fc +
  geom_point(data = dat_env_all, aes(X*1000, Y*1000, color = temp), alpha = 0.5) +
  theme_sleek(base_size = 6) + 
  geom_sf() 
Warning: Removed 3480 rows containing missing values or values outside the scale range
(`geom_point()`).

plot_map_fc +
  geom_point(data = dat_env_all, aes(X*1000, Y*1000, color = sal), alpha = 0.5) +
  theme_sleek(base_size = 6) + 
  geom_sf() 
Warning: Removed 3480 rows containing missing values or values outside the scale range
(`geom_point()`).

dat_env <- dat_env_all |> dplyr::select(-yearmonth)

Add cod densities, only for post 1992!

# for the Mod, the fr data is instead used to scale the prediction grid
data_stats <- read_csv(paste0(home, "/data/clean/data_stats.csv")) 
Rows: 12328 Columns: 43
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): country, haul_id, ices_rect, month_year
dbl (39): year, lat, lon, quarter, month, sub_div, oxy, temp, sal, depth, X,...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
m3 <- readRDS(paste0(home, file = "/R/main-analysis/m3.rds"))
hist(m3$data$density)

# d_stats is used to scale the prediction grid when predictiong cod densities
data_stats_st <- data_stats |>
  summarise(depth_mean = mean(depth, na.omit = TRUE),
            depth_sd = sd(depth),
            sal_mean = mean(sal, na.omit = TRUE),
            sal_sd = sd(sal),
            oxy_mean = mean(oxy, na.omit = TRUE),
            oxy_sd = sd(oxy),
            temp_mean = mean(temp, na.omit = TRUE),
            temp_sd = sd(temp))


df_m3 <- dat_env |>
  filter(between(year, 1993, 2021)) |>
  drop_na(oxy,
          sal,
          temp) |>
  mutate(depth = ifelse(depth < 0, 0, depth),
         depth_sc = (depth - data_stats_st$depth_mean)/data_stats_st$depth_sd,
         oxy_sc = (oxy - data_stats_st$oxy_mean)/data_stats_st$oxy_sd,
         sal_sc = (sal - data_stats_st$sal_mean)/data_stats_st$sal_sd,
         temp_sc = (temp - data_stats_st$temp_mean)/data_stats_st$temp_sd,
         depth_sq = depth_sc^2,
         temp_sq = temp_sc^2,
         year_f = as.factor(year),
         quarter_f = as.factor(1))

# Predict cod cpue density with model model from Max  
cpue_cod <- predict(m3, newdata = df_m3) |>
  mutate(log_density_cod = est) |> # on the log scale Max and Sean says!
  dplyr::select(lat, lon, log_density_cod) |>
  distinct()

dat_env2 <- dat_env |>
  left_join(cpue_cod) 
Joining with `by = join_by(lat, lon)`
Warning in left_join(dat_env, cpue_cod): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 2 of `x` matches multiple rows in `y`.
ℹ Row 480 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
hist(exp(dat_env2$log_density_cod))

Save

# Save data
saveRDS(dat_env2, paste0(home, "/data/clean/stom_env.rds"))